This is the work for machine failure prediction: we predict the state of the machine one hour ahead, based on what we learn from the preceding data set.

The first step is installing the different packages and loading them, if you do not already have them.
#################################### Preparing the data set ################################################
# Installation of packages and loading them ###########################
#### Installation of the necessary packages; only needs to be done once
#install.packages("corrplot")
# install.packages("caret")
#install.packages("randomForest")
# install.packages("MASS")
# install.packages("rpart")
# install.packages("e1071")
#install.packages("glmnet")
#install.packages("plotly")   # fixed typo: was "install.pacakges"
#install.packages("missMDA")
#install.packages("pROC")
#install.packages("DMwR")
#install.packages("gbm")
# install.packages("rattle")
# install.packages("rpart.plot")
# install.packages("RColorBrewer")
# install.packages("party")
# install.packages("partykit")
#### Loading the necessary packages
library(pROC)           # ROC curve methods
library(MASS)
library(rpart)          # loaded once (was duplicated further down)
library(randomForest)
library(e1071)
library(glmnet)
library(plotly)
library(ggplot2)
library(missMDA)        # PCA-based imputation of missing values
library(caret)          # loaded once (was duplicated further down)
library(DMwR)           # SMOTE resampling
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(party)
library(partykit)

Then we set the working directory and load the data set, resampled at one-hour intervals.

# Point the R session at the project archive and read the 1-hour resampled
# data set.  NOTE(review): hard-coded absolute path — only works on the
# original author's machine.
setwd("/home/moustapha/Energiency Big Data Project/Archive")
data <- read.table("all1h2.csv", header = TRUE, sep = ",")
# Parse the first column as timestamps, rename it "date", and use it as the
# row names so rows can be addressed by time.
parsed_dates <- as.POSIXlt(data$X, format = "%Y-%m-%d %H:%M:%S")
data$X <- parsed_dates
colnames(data)[1] <- "date"
rownames(data) <- data$date

First View of the Data

##       date                         prodh            elec      
##  Min.   :2012-12-31 23:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2013-10-20 21:45:00   1st Qu.:38.66   1st Qu.:20.58  
##  Median :2014-08-09 20:30:00   Median :42.98   Median :21.20  
##  Mean   :2014-08-09 20:57:44   Mean   :38.67   Mean   :19.93  
##  3rd Qu.:2015-05-29 19:15:00   3rd Qu.:45.82   3rd Qu.:21.89  
##  Max.   :2016-03-17 18:00:00   Max.   :52.61   Max.   :24.09  
##                                NA's   :1460    NA's   :1284   
##       gram           planstop          prod      
##  Min.   :     0   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:    42   1st Qu.:1.000   1st Qu.:37.74  
##  Median :    45   Median :1.000   Median :42.71  
##  Mean   : 46818   Mean   :0.802   Mean   :38.33  
##  3rd Qu.:    45   3rd Qu.:1.000   3rd Qu.:45.41  
##  Max.   :488110   Max.   :1.000   Max.   :51.77  
##  NA's   :22273    NA's   :17623   NA's   :22957

Reshaping the gram variable so it can be added to the plot.

##       date                         prodh            elec      
##  Min.   :2012-12-31 23:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2013-10-20 21:45:00   1st Qu.:38.66   1st Qu.:20.58  
##  Median :2014-08-09 20:30:00   Median :42.98   Median :21.20  
##  Mean   :2014-08-09 20:57:44   Mean   :38.67   Mean   :19.93  
##  3rd Qu.:2015-05-29 19:15:00   3rd Qu.:45.82   3rd Qu.:21.89  
##  Max.   :2016-03-17 18:00:00   Max.   :52.61   Max.   :24.09  
##                                NA's   :1460    NA's   :1284   
##       gram          planstop          prod      
##  Min.   : 0.00   Min.   :0.000   Min.   : 0.00  
##  1st Qu.:28.00   1st Qu.:1.000   1st Qu.:37.74  
##  Median :42.00   Median :1.000   Median :42.71  
##  Mean   :37.36   Mean   :0.802   Mean   :38.33  
##  3rd Qu.:45.00   3rd Qu.:1.000   3rd Qu.:45.41  
##  Max.   :61.50   Max.   :1.000   Max.   :51.77  
##  NA's   :22273   NA's   :17623   NA's   :22957

Then selecting the subset of the data containing only complete planstop values.

##       date                         prodh            elec      
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2015-04-24 15:00:00   1st Qu.:37.43   1st Qu.:20.39  
##  Median :2015-08-12 00:00:00   Median :42.36   Median :20.66  
##  Mean   :2015-08-12 00:31:11   Mean   :37.88   Mean   :19.42  
##  3rd Qu.:2015-11-29 09:00:00   3rd Qu.:45.00   3rd Qu.:21.02  
##  Max.   :2016-03-17 18:00:00   Max.   :52.25   Max.   :22.98  
##                                NA's   :1361    NA's   :1259   
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:28.00   1st Qu.:1.0000   1st Qu.:37.74  
##  Median :42.00   Median :1.0000   Median :42.71  
##  Mean   :37.36   Mean   :0.8023   Mean   :38.33  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:45.41  
##  Max.   :61.50   Max.   :1.0000   Max.   :51.77  
##  NA's   :4650                     NA's   :5334

According to the summary and the plot, we will keep the variables with the fewest missing values, namely prodh and elec; we will also restrict the data to dates up to "2016-01-26 07:00:00".

# Keep only the rows up to (and including) the chosen cut-off timestamp.
cutoff <- which(nomissing$date == "2016-01-26 07:00:00")
data1 <- nomissing[1:cutoff, ]
summary(data1)
##       date                         prodh            elec      
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.:37.43   1st Qu.:20.39  
##  Median :2015-07-17 06:30:00   Median :42.36   Median :20.66  
##  Mean   :2015-07-17 06:57:21   Mean   :37.88   Mean   :19.42  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:45.00   3rd Qu.:21.02  
##  Max.   :2016-01-26 07:00:00   Max.   :52.25   Max.   :22.98  
##                                NA's   :126     NA's   :24     
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:22.51   1st Qu.:1.0000   1st Qu.:37.74  
##  Median :42.00   Median :1.0000   Median :42.71  
##  Mean   :36.87   Mean   :0.7849   Mean   :38.33  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:45.41  
##  Max.   :61.50   Max.   :1.0000   Max.   :51.77  
##  NA's   :3880                     NA's   :4099

We replace the missing values in prodh with the corresponding values in prod.

##       date                         prodh            elec      
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.:37.43   1st Qu.:20.39  
##  Median :2015-07-17 06:30:00   Median :42.37   Median :20.66  
##  Mean   :2015-07-17 06:57:21   Mean   :37.89   Mean   :19.42  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:45.02   3rd Qu.:21.02  
##  Max.   :2016-01-26 07:00:00   Max.   :52.25   Max.   :22.98  
##                                NA's   :37      NA's   :24     
##       gram          planstop           prod      
##  Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:22.51   1st Qu.:1.0000   1st Qu.:37.74  
##  Median :42.00   Median :1.0000   Median :42.71  
##  Mean   :36.87   Mean   :0.7849   Mean   :38.33  
##  3rd Qu.:45.00   3rd Qu.:1.0000   3rd Qu.:45.41  
##  Max.   :61.50   Max.   :1.0000   Max.   :51.77  
##  NA's   :3880                     NA's   :4099

Let's check the proportion of missing values in our data set.

##           
##            FALSE TRUE
##   date      9266    0
##   prodh     9229   37
##   elec      9242   24
##   gram      5386 3880
##   planstop  9266    0
##   prod      5167 4099

There are still some missing values left in prodh (37) and elec (24); we will impute them with the imputePCA method from the missMDA package.

## $ncp
## [1] 1
## 
## $criterion
##        0        1        2 
## 63.29179 27.23849 58.53466
##      prodh            elec          planstop     
##  Min.   : 0.00   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:37.45   1st Qu.:20.38   1st Qu.:1.0000  
##  Median :42.33   Median :20.66   Median :1.0000  
##  Mean   :37.89   Mean   :19.42   Mean   :0.7849  
##  3rd Qu.:45.00   3rd Qu.:21.01   3rd Qu.:1.0000  
##  Max.   :52.25   Max.   :22.98   Max.   :1.0000

Plot of the final data set without missing values.

Second step: creation of the failure-state variable that will be our target.

##       date                         prodh            elec      
##  Min.   :2015-01-05 06:00:00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:2015-04-11 18:15:00   1st Qu.:37.45   1st Qu.:20.38  
##  Median :2015-07-17 06:30:00   Median :42.33   Median :20.66  
##  Mean   :2015-07-17 06:57:21   Mean   :37.89   Mean   :19.42  
##  3rd Qu.:2015-10-21 18:45:00   3rd Qu.:45.00   3rd Qu.:21.01  
##  Max.   :2016-01-26 07:00:00   Max.   :52.25   Max.   :22.98  
##     planstop           State     
##  Min.   :0.0000   Faillure: 848  
##  1st Qu.:1.0000   Normal  :8418  
##  Median :1.0000                  
##  Mean   :0.7849                  
##  3rd Qu.:1.0000                  
##  Max.   :1.0000
## 
##  Faillure    Normal 
##  9.151738 90.848262
## 
##  Faillure       fix    Normal 
##  2.082884  7.068854 90.848262
## 
##  Faillure    Normal 
##  2.082884 97.917116

For the moment we have only five variables in our data set; let's create additional ones from it, which we will use for our models.

Let's create the variable worktime, which counts the consecutive working hours of the machine.

## Working-time counter: number of consecutive "Normal" hours since the last
## failure.  Vectorised with rle()/sequence() instead of growing `worktime`
## element-by-element inside a 1:nrow() loop (repeated reallocation is
## O(n^2); the result is identical).
normal_runs <- rle(data2$State == "Normal")
# sequence() emits 1..len inside each run; multiplying by the run's value
# (TRUE/FALSE) zeroes the counter on every non-"Normal" row, which is
# exactly the original "reset cou to 0" behaviour.
worktime <- sequence(normal_runs$lengths) * rep(normal_runs$values, normal_runs$lengths)

data2$worktime <- worktime

Creation of variables to account for the production and electricity variations.

Then we create a function to apply transformations to our variables (log(var+1), sqrt, power2, power3, poly).

We also decompose the date into months and weekdays and add these as variables.

We write a function that creates new variables from time lags.

## a function for new variable from time decalage

## Append `number` lagged copies of column `var` to `data`.
## Lag i shifts the column down by i rows, padding the first i entries
## with NA (the appended columns get default names; callers rename them).
## Fixed: the column index is now looked up in `data` itself — the original
## searched the *global* `d1`, which silently returned the wrong column (or
## none) whenever `data` and `d1` had different layouts.
varcreation <- function(number, data, var) {
  n <- nrow(data)
  k <- which(colnames(data) == var)
  for (i in seq_len(number)) {
    lagged <- c(rep(NA, i), data[, k][-((n - i + 1):n)])
    data[length(data) + 1] <- lagged
  }
  return(data)
}

d1 <- data2

## Append `numb` hourly-lagged versions of column `var` to the global data
## set d1, naming them "<var>..1" ... "<var>..numb".
## Fixed: the body no longer overwrites the `numb` argument with a
## hard-coded 24 (the parameter was silently ignored), and the unused local
## `namm1` was removed.  Behaviour is unchanged for the existing calls,
## which all pass numb = 24.
decal <- function(var, numb = 24) {
  n <- length(d1)  # current number of columns, before the lags are added
  d1 <- varcreation(numb, d1, var)
  colnames(d1)[(n + 1):(n + numb)] <- paste(rep(var, numb), 1:numb, sep = "..")
  return(d1)
}

## Build 24 hourly lags for every column except the date (column 1).
tode <- colnames(d1)[-1]
for (i in seq_along(tode)) {
  d1 <- decal(tode[i], numb = 24)
}

## Factor problem management.
## After lagging, the factor columns (State, ffail, ffail2, months, wday)
## come back as plain vectors; re-cast each of the 24 lagged copies to a
## factor and re-attach the level labels of the source column.
## NOTE(review): assigning levels() relabels by POSITION.  This is only
## correct if every lagged column contains the same set of observed values,
## in the same order, as the source column — verify, otherwise labels are
## silently swapped for lags that miss a level.
numb=24
namm1 = paste(rep("State",numb),c(1:numb),sep = "..")
namm2 = paste(rep("ffail",numb),c(1:numb),sep = "..")
namm3 = paste(rep("ffail2",numb),c(1:numb),sep = "..")
namm4 = paste(rep("months",numb),c(1:numb),sep = "..")
namm5 = paste(rep("wday",numb),c(1:numb),sep = "..")
# Same cast-then-relabel treatment applied to the 24 lags of each of the
# five factor variables.
for (i in 1:numb) {
  d1[,which(colnames(d1) == namm1[i])] = as.factor(d1[,which(colnames(d1) == namm1[i])])
  levels(d1[,which(colnames(d1) == namm1[i])])=levels(d1$State)
  
  d1[,which(colnames(d1) == namm2[i])] = as.factor(d1[,which(colnames(d1) == namm2[i])])
  levels(d1[,which(colnames(d1) == namm2[i])])=levels(d1$ffail)
  
  d1[,which(colnames(d1) == namm3[i])] = as.factor(d1[,which(colnames(d1) == namm3[i])])
  levels(d1[,which(colnames(d1) == namm3[i])])=levels(d1$ffail2)
  
  d1[,which(colnames(d1) == namm4[i])] = as.factor(d1[,which(colnames(d1) == namm4[i])])
  levels(d1[,which(colnames(d1) == namm4[i])])=levels(d1$months)
  
  d1[,which(colnames(d1) == namm5[i])] = as.factor(d1[,which(colnames(d1) == namm5[i])])
  levels(d1[,which(colnames(d1) == namm5[i])])=levels(d1$wday)
}

creation of the working data set

## Build the modelling data set: drop columns 1:3 and 8:28 (presumably the
## raw date and contemporaneous engineered columns — confirm against the
## chunk that built d1), then remove incomplete rows.
df <- d1[, -c(1:3, 8:28)]
df <- na.omit(df)

## Split df into the three targets and the predictor matrix used by the
## models below.
y1 <- df[, 2]
y2 <- df[, 4]
y3 <- df[, 3]
x  <- df[, -c(2, 3, 4)]

df <- data.frame(y1, y2, y3, x)

## Fully numeric copy of the predictors (factors become integer codes),
## keeping the same row names as df.
x <- as.data.frame(lapply(x, as.numeric))
rownames(x) <- rownames(df)

Some interesting plot

## Faillure   Normal 
##      848     8393

## Faillure   Normal 
##      193     9048

## Faillure      fix   Normal 
##      193      655     8393

Making the test and training splits.

## ytrain1
##   Faillure     Normal 
## 0.09180835 0.90819165
## ytest1
##   Faillure     Normal 
## 0.09166366 0.90833634
## ytrain1T
## Faillure   Normal 
## 0.089375 0.910625
## ytest1T
##  Faillure    Normal 
## 0.1071716 0.8928284
## ytrain2
##   Faillure     Normal 
## 0.02179289 0.97820711
## ytest2
##   Faillure     Normal 
## 0.01876579 0.98123421
## ytrain2T
## Faillure   Normal 
##  0.02075  0.97925
## ytest2T
##   Faillure     Normal 
## 0.02175665 0.97824335

Creation of the over-sampled (SMOTE) training data.

New proportion of failures in the random-split case.

## 
##  Faillure    Normal 
## 0.3076923 0.6923077

New proportion of failures in the time-slice case.

## 
##  Faillure    Normal 
## 0.3076923 0.6923077

After data preparation, let's set up the environment.

Application of glm

############ Glm model

## 2-fold cross-validation with class probabilities enabled (required for
## the type = "prob" predictions used later).
ctrl <- trainControl(method = "cv", number = 2, classProbs = TRUE)

## Fit a binomial GLM through caret, training in parallel on 5 cores.
glm.clas <- function(xd, yd, ctrl) {
  registerDoMC(cores = 5)
  train(y = yd, x = xd, method = "glm", trControl = ctrl, family = "binomial")
}


########## the first case machine learning approches ####
## Case 1: GLM, random split, original (imbalanced) training data.
## Pipeline: fit -> choose a probability threshold on the TRAINING fit
## (opthr) -> apply it to held-out probabilities -> confusion table, ROC,
## variable importance.  NOTE(review): the threshold is tuned on training
## predictions, which is optimistic; a validation split would be cleaner.


#### Only the first faillure prediction
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# fitting the model glm
# NOTE(review): the warnings below suggest quasi-separation and collinear
# lagged predictors — GLM coefficients are unreliable here.
model=glm.clas(xtr, ytr,ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# predProba[1] is the first probability column — presumably the "Faillure"
# class (first factor level); TODO confirm.
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       13    105
##   Normal         39   2614
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)))

## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6057
mean(pred==ytest)
## [1] 0.9480332
sensibility
## [1] 0.25
specificity
## [1] 0.9613829
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### smote data Prediction of the first faillure
## Same GLM pipeline as above, but trained on the SMOTE-rebalanced data
## (dfSmote2: column 1 is the target, the rest are predictors); evaluated
## on the original, untouched test split.
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# fitting the model glm
model=glm.clas(xtr,ytr,ctrl)
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       20    567
##   Normal         32   2152
mean(pred==ytest)
## [1] 0.7838326
roc(as.numeric(ytest), as.numeric(pred))
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.588
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.588
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.3846154
specificity
## [1] 0.7914675
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ second approch in case of time slice ####
## Same GLM pipeline, but on the time-slice split (suffix "T"): training and
## test sets are contiguous in time instead of randomly sampled, which is
## the realistic setting for forecasting one hour ahead.

#### Predicting all the faillures
# the second case with the first faillure ###
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# fitting the model glm
model=glm.clas(xtr,ytr,ctrl)
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        5     13
##   Normal         22   1201
mean(pred==ytest)
## [1] 0.9717969
roc(as.numeric(ytest), as.numeric(pred))
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5872
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5872
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.1851852
specificity
## [1] 0.9892916
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### time slice
#### Prediction of the first faillure # approch with smote data
## GLM on the time-slice split, trained on SMOTE-rebalanced data
## (dfSmote2T) and evaluated on the untouched time-slice test set.
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# fitting the model glm
model=glm.clas(xtr,ytr,ctrl)
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        9    169
##   Normal         18   1045
mean(pred==ytest)
## [1] 0.8493151
roc(as.numeric(ytest), as.numeric(pred))
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5971
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5971
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.3333333
specificity
## [1] 0.8607908
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

Application of SVM

##########     SVM radial      #########################
## fitting the control parameter
ctrl <- trainControl(method = "cv", number = 2, classProbs = TRUE)

## Fit an RBF-kernel SVM via caret, centring and scaling the predictors,
## training in parallel on 5 cores.
## Fixed: the original passed `metrics = "ROC"`, but `train()` has no
## `metrics` argument (the real one is `metric`, and ROC-based selection
## would additionally need summaryFunction = twoClassSummary in the control
## object).  The argument was silently swallowed by `...` and had no effect,
## so it is removed; model selection still uses caret's default Accuracy
## metric, exactly as before.
svm.radial=function(xd,yd, ctrl){
  registerDoMC(cores = 5)
  model <- train(x=xd,y=yd,
                 method = "svmRadial",
                 trControl = ctrl,
                 preProc = c("center", "scale")
  )
  return (model)
}

########## the first case machine learning approches ####
## RBF SVM, random split, original (imbalanced) training data.  Same
## fit -> tune threshold on training fit -> evaluate pipeline as the GLM.


#### Only the first faillure prediction
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# fitting the model svm
model=svm.radial(xtr,ytr,ctrl)
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:modeltools':
## 
##     prior
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.03
# predicting

predProba<-predict(model, xte, type = "prob")
# predProba[1] is the first probability column — presumably "Faillure";
# TODO confirm factor-level order.
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       19    232
##   Normal         33   2487
mean(pred==ytest)
## [1] 0.9043667
roc(as.numeric(ytest), as.numeric(pred))
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.64
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.64
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.3653846
specificity
## [1] 0.9146745
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### smote data Prediction of the first faillure
## RBF SVM trained on the SMOTE-rebalanced random-split data, evaluated on
## the untouched test split.
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# fitting the model svm
model=svm.radial(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.24
# predicting

predProba<-predict(model, xte, type = "prob")
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       27    309
##   Normal         25   2410
mean(pred==ytest)
## [1] 0.8794659
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.7028
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.5192308
specificity
## [1] 0.8863553
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ second approch in case of time slice ####
## RBF SVM on the time-slice split (contiguous-in-time train/test, the
## realistic one-hour-ahead forecasting setting).

#### Predicting all the faillures
# the second case with the first faillure ###
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# fitting the model svm
model=svm.radial(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.03
# predicting

predProba<-predict(model, xte, type = "prob")
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        8    146
##   Normal         19   1068
mean(pred==ytest)
## [1] 0.8670427
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.588
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.2962963
specificity
## [1] 0.8797364
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### time slice
#### Prediction of the first faillure # approch with smote data
## RBF SVM on the time-slice split, trained on SMOTE-rebalanced data.
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# fitting the model svm
model=svm.radial(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.19
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): a hard-coded threshold of 0.05 is used here instead of the
# optimalthr (0.19) computed just above — confirm this is intentional.
pred = as.factor(ifelse(predProba[1] > 0.05,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       20    603
##   Normal          7    611
mean(pred==ytest)
## [1] 0.5084609
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.622
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.7407407
specificity
## [1] 0.5032949
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

Application of treebag model

### Tree bag model

## 2-fold cross-validation with class probabilities, as for the other models.
ctrl <- trainControl(method = "cv", number = 2, classProbs = TRUE)

## Bagged-tree classifier fitted through caret, in parallel on 5 cores.
tbmodel <- function(xd, yd, ctrl) {
  registerDoMC(cores = 5)
  train(x = xd, y = yd, method = "treebag", trControl = ctrl)
}


########## the first case machine learning approches ####
## Bagged trees, random split, original (imbalanced) training data.


#### Only the first faillure prediction
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# fitting the model tree bag model
model=tbmodel(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.28
# predicting


predProba<-predict(model, xte, type = "prob")
# NOTE(review): a hard-coded threshold of 0.05 is used here instead of the
# optimalthr (0.28) computed just above — confirm this is intentional.
pred = as.factor(ifelse(predProba[1] > 0.05,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       21    335
##   Normal         31   2384
mean(pred==ytest)
## [1] 0.8679177
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6403
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.4038462
specificity
## [1] 0.8767929
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### smote data Prediction of the first faillure
## Bagged trees trained on the SMOTE-rebalanced random-split data.
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# fitting the model tree bag model
model=tbmodel(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.36
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): a hard-coded threshold of 0.2 is used here instead of the
# optimalthr (0.36) computed just above — confirm this is intentional.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       34    609
##   Normal         18   2110
mean(pred==ytest)
## [1] 0.7737279
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.7149
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.6538462
specificity
## [1] 0.7760206
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ second approch in case of time slice ####
## Bagged trees on the time-slice split (contiguous-in-time train/test).

#### Predicting all the faillures
# the second case with the first faillure ###
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# fitting the model tree bag model
model=tbmodel(xtr,ytr,ctrl)
# assessing performance and choosing the best threshold
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.28
# predicting

predProba<-predict(model, xte, type = "prob")
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
# "sensibility" = sensitivity (recall on Faillure); specificity = recall on Normal.
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        5      6
##   Normal         22   1208
mean(pred==ytest)
## [1] 0.9774376
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.5901
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.1851852
specificity
## [1] 0.9950577
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### time slice
#### Prediction of the first failure # approach with SMOTE-balanced data
# dfSmote2T holds the SMOTE-resampled time-slice training set: column 1 is
# the outcome (ytrain2T), the remaining columns are the predictors.
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# fitting the bagged-tree (treebag) model via the tbmodel() helper.
model=tbmodel(xtr,ytr,ctrl)
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.32
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.32) is computed above but a hard-coded 0.2 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       13    187
##   Normal         14   1027
# Overall accuracy.
mean(pred==ytest)
## [1] 0.8380338
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6637
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.4814815
specificity
## [1] 0.8459638
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

Application of the GBM (gradient boosting machine) model

##########     gbm      #########################
library(gbm)
# 2-fold CV with class probabilities enabled (probabilities are needed for
# the threshold tuning done after each fit).
ctrl = trainControl( method = "cv", number = 2, classProbs = TRUE)

# Tuning grid: tree depth 1 or 5; 50, 750 or 1500 boosting iterations;
# fixed shrinkage (learning rate) and minimum terminal-node size.
gbmGrid <-  expand.grid(interaction.depth = c(1, 5), n.trees = c(1,15,30)*50, 
                        shrinkage = 0.1,
                        n.minobsinnode = 20)

# Fit a stochastic gradient boosting classifier through caret (method "gbm").
#
# xd   - predictor matrix / data frame.
# yd   - outcome factor ("Faillure"/"Normal").
# ctrl - a caret::trainControl object describing the resampling scheme.
#
# Relies on the globally defined `gbmGrid` tuning grid, and registers a
# 5-core doMC backend so caret can resample in parallel.
# Returns the fitted caret `train` object.
gbm.model <- function(xd, yd, ctrl) {
  registerDoMC(cores = 5)  # parallel backend for caret resampling
  train(
    x = xd,
    y = yd,
    method = "gbm",
    trControl = ctrl,
    tuneGrid = gbmGrid
  )
}


########## the first case machine learning approaches ####


#### Only the first failure prediction
# Random (non time-sliced) train/test split built earlier in the file.
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# fitting the gbm model (verbose boosting log captured below)
model=gbm.model(xtr,ytr,ctrl)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        0.1812            -nan     0.1000    0.0098
##      2        0.1767            -nan     0.1000    0.0007
##      3        0.1700            -nan     0.1000    0.0013
##      4        0.1675            -nan     0.1000   -0.0002
##      5        0.1656            -nan     0.1000   -0.0002
##      6        0.1625            -nan     0.1000   -0.0001
##      7        0.1590            -nan     0.1000    0.0002
##      8        0.1543            -nan     0.1000    0.0005
##      9        0.1516            -nan     0.1000    0.0001
##     10        0.1484            -nan     0.1000    0.0001
##     20        0.1260            -nan     0.1000    0.0001
##     40        0.0980            -nan     0.1000   -0.0001
##     60        0.0788            -nan     0.1000   -0.0002
##     80        0.0627            -nan     0.1000   -0.0001
##    100        0.0522            -nan     0.1000   -0.0001
##    120        0.0425            -nan     0.1000   -0.0001
##    140        0.0353            -nan     0.1000   -0.0001
##    160        0.0295            -nan     0.1000   -0.0000
##    180        0.0252            -nan     0.1000   -0.0001
##    200        0.0219            -nan     0.1000   -0.0000
##    220        0.0192            -nan     0.1000   -0.0000
##    240        0.0169            -nan     0.1000    0.0000
##    260        0.0151            -nan     0.1000   -0.0000
##    280        0.0134            -nan     0.1000   -0.0000
##    300        0.0118            -nan     0.1000   -0.0000
##    320        0.0105            -nan     0.1000   -0.0000
##    340        0.0092            -nan     0.1000   -0.0000
##    360        0.0083            -nan     0.1000   -0.0000
##    380        0.0073            -nan     0.1000   -0.0000
##    400        0.0066            -nan     0.1000   -0.0000
##    420        0.0060            -nan     0.1000    0.0000
##    440        0.0054            -nan     0.1000   -0.0000
##    460        0.0049            -nan     0.1000   -0.0000
##    480        0.0043            -nan     0.1000   -0.0000
##    500        0.0039            -nan     0.1000   -0.0000
##    520        0.0035            -nan     0.1000   -0.0000
##    540        0.0032            -nan     0.1000   -0.0000
##    560        0.0029            -nan     0.1000   -0.0000
##    580        0.0026            -nan     0.1000   -0.0000
##    600        0.0023            -nan     0.1000   -0.0000
##    620        0.0021            -nan     0.1000   -0.0000
##    640        0.0019            -nan     0.1000   -0.0000
##    660        0.0017            -nan     0.1000   -0.0000
##    680        0.0015            -nan     0.1000    0.0000
##    700        0.0014            -nan     0.1000   -0.0000
##    720        0.0013            -nan     0.1000   -0.0000
##    740        0.0011            -nan     0.1000   -0.0000
##    750        0.0011            -nan     0.1000   -0.0000
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.03
# predicting


predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.03) is computed above but a hard-coded 0.05 is
# used here -- confirm which threshold is intended. predProba[1] is assumed
# to be the "Faillure" probability column (TODO confirm column order).
pred = as.factor(ifelse(predProba[1] > 0.05,"Faillure","Normal"))
# sensitivity (recall on "Faillure") and specificity (recall on "Normal").
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       12     18
##   Normal         40   2701
# Overall accuracy.
mean(pred==ytest)
## [1] 0.9790689
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6121
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.2307692
specificity
## [1] 0.9933799
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### SMOTE data: prediction of the first failure
# dfSmote2 is the SMOTE-resampled (random-split) training set: column 1 is
# the outcome (ytrain2), the remaining columns are the predictors.
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# fitting the gbm model (verbose boosting log captured below)
model=gbm.model(xtr,ytr,ctrl)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.1487            -nan     0.1000    0.0413
##      2        1.0768            -nan     0.1000    0.0340
##      3        1.0205            -nan     0.1000    0.0258
##      4        0.9554            -nan     0.1000    0.0330
##      5        0.9031            -nan     0.1000    0.0236
##      6        0.8555            -nan     0.1000    0.0212
##      7        0.8183            -nan     0.1000    0.0171
##      8        0.7881            -nan     0.1000    0.0128
##      9        0.7617            -nan     0.1000    0.0118
##     10        0.7343            -nan     0.1000    0.0109
##     20        0.5529            -nan     0.1000    0.0035
##     40        0.3742            -nan     0.1000    0.0003
##     60        0.2888            -nan     0.1000   -0.0003
##     80        0.2247            -nan     0.1000   -0.0000
##    100        0.1799            -nan     0.1000   -0.0003
##    120        0.1487            -nan     0.1000   -0.0001
##    140        0.1222            -nan     0.1000   -0.0000
##    160        0.1007            -nan     0.1000    0.0000
##    180        0.0841            -nan     0.1000   -0.0001
##    200        0.0711            -nan     0.1000   -0.0001
##    220        0.0605            -nan     0.1000   -0.0001
##    240        0.0512            -nan     0.1000   -0.0001
##    260        0.0429            -nan     0.1000   -0.0000
##    280        0.0368            -nan     0.1000   -0.0001
##    300        0.0315            -nan     0.1000   -0.0000
##    320        0.0270            -nan     0.1000   -0.0000
##    340        0.0232            -nan     0.1000   -0.0000
##    360        0.0197            -nan     0.1000   -0.0000
##    380        0.0169            -nan     0.1000    0.0000
##    400        0.0146            -nan     0.1000   -0.0000
##    420        0.0124            -nan     0.1000   -0.0000
##    440        0.0108            -nan     0.1000   -0.0000
##    460        0.0093            -nan     0.1000    0.0000
##    480        0.0081            -nan     0.1000   -0.0000
##    500        0.0071            -nan     0.1000   -0.0000
##    520        0.0061            -nan     0.1000   -0.0000
##    540        0.0052            -nan     0.1000   -0.0000
##    560        0.0045            -nan     0.1000   -0.0000
##    580        0.0039            -nan     0.1000    0.0000
##    600        0.0034            -nan     0.1000   -0.0000
##    620        0.0029            -nan     0.1000   -0.0000
##    640        0.0025            -nan     0.1000   -0.0000
##    660        0.0022            -nan     0.1000   -0.0000
##    680        0.0019            -nan     0.1000   -0.0000
##    700        0.0016            -nan     0.1000   -0.0000
##    720        0.0014            -nan     0.1000   -0.0000
##    740        0.0012            -nan     0.1000   -0.0000
##    760        0.0010            -nan     0.1000   -0.0000
##    780        0.0009            -nan     0.1000   -0.0000
##    800        0.0008            -nan     0.1000   -0.0000
##    820        0.0007            -nan     0.1000   -0.0000
##    840        0.0006            -nan     0.1000   -0.0000
##    860        0.0005            -nan     0.1000   -0.0000
##    880        0.0004            -nan     0.1000   -0.0000
##    900        0.0004            -nan     0.1000   -0.0000
##    920        0.0003            -nan     0.1000   -0.0000
##    940        0.0003            -nan     0.1000   -0.0000
##    960        0.0002            -nan     0.1000   -0.0000
##    980        0.0002            -nan     0.1000   -0.0000
##   1000        0.0002            -nan     0.1000   -0.0000
##   1020        0.0001            -nan     0.1000    0.0000
##   1040        0.0001            -nan     0.1000   -0.0000
##   1060        0.0001            -nan     0.1000   -0.0000
##   1080        0.0001            -nan     0.1000   -0.0000
##   1100        0.0001            -nan     0.1000   -0.0000
##   1120        0.0001            -nan     0.1000   -0.0000
##   1140        0.0001            -nan     0.1000   -0.0000
##   1160        0.0001            -nan     0.1000   -0.0000
##   1180        0.0000            -nan     0.1000   -0.0000
##   1200        0.0000            -nan     0.1000   -0.0000
##   1220        0.0000            -nan     0.1000    0.0000
##   1240        0.0000            -nan     0.1000   -0.0000
##   1260        0.0000            -nan     0.1000   -0.0000
##   1280        0.0000            -nan     0.1000   -0.0000
##   1300        0.0000            -nan     0.1000   -0.0000
##   1320        0.0000            -nan     0.1000   -0.0000
##   1340        0.0000            -nan     0.1000   -0.0000
##   1360        0.0000            -nan     0.1000   -0.0000
##   1380        0.0000            -nan     0.1000   -0.0000
##   1400        0.0000            -nan     0.1000   -0.0000
##   1420        0.0000            -nan     0.1000    0.0000
##   1440        0.0000            -nan     0.1000   -0.0000
##   1460        0.0000            -nan     0.1000   -0.0000
##   1480        0.0000            -nan     0.1000   -0.0000
##   1500        0.0000            -nan     0.1000   -0.0000
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.01) is computed above but a hard-coded 0.2 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       17     83
##   Normal         35   2636
# Overall accuracy.
mean(pred==ytest)
## [1] 0.9574161
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6482
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.3269231
specificity
## [1] 0.9694741
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ second approach in case of time slice ####

#### Predicting all the failures
# the second case with the first failure ###
# Time-slice split: the *T train/test objects are built earlier in the file.
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# fitting the gbm model (verbose boosting log captured below)
model=gbm.model(xtr,ytr,ctrl)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        0.1722            -nan     0.1000    0.0100
##      2        0.1678            -nan     0.1000    0.0009
##      3        0.1639            -nan     0.1000   -0.0001
##      4        0.1615            -nan     0.1000    0.0002
##      5        0.1584            -nan     0.1000   -0.0003
##      6        0.1563            -nan     0.1000    0.0002
##      7        0.1531            -nan     0.1000   -0.0000
##      8        0.1503            -nan     0.1000    0.0010
##      9        0.1478            -nan     0.1000    0.0004
##     10        0.1458            -nan     0.1000    0.0001
##     20        0.1279            -nan     0.1000   -0.0002
##     40        0.1048            -nan     0.1000    0.0004
##     60        0.0885            -nan     0.1000   -0.0002
##     80        0.0751            -nan     0.1000   -0.0001
##    100        0.0621            -nan     0.1000   -0.0002
##    120        0.0513            -nan     0.1000   -0.0000
##    140        0.0433            -nan     0.1000   -0.0000
##    160        0.0379            -nan     0.1000   -0.0000
##    180        0.0326            -nan     0.1000   -0.0001
##    200        0.0284            -nan     0.1000   -0.0000
##    220        0.0245            -nan     0.1000   -0.0000
##    240        0.0215            -nan     0.1000   -0.0000
##    260        0.0190            -nan     0.1000   -0.0000
##    280        0.0169            -nan     0.1000   -0.0000
##    300        0.0150            -nan     0.1000   -0.0000
##    320        0.0135            -nan     0.1000   -0.0000
##    340        0.0122            -nan     0.1000    0.0000
##    360        0.0109            -nan     0.1000   -0.0000
##    380        0.0099            -nan     0.1000   -0.0000
##    400        0.0089            -nan     0.1000   -0.0000
##    420        0.0081            -nan     0.1000    0.0000
##    440        0.0074            -nan     0.1000   -0.0000
##    460        0.0066            -nan     0.1000   -0.0000
##    480        0.0060            -nan     0.1000   -0.0000
##    500        0.0055            -nan     0.1000   -0.0000
##    520        0.0049            -nan     0.1000   -0.0000
##    540        0.0045            -nan     0.1000   -0.0000
##    560        0.0042            -nan     0.1000   -0.0000
##    580        0.0038            -nan     0.1000   -0.0000
##    600        0.0035            -nan     0.1000   -0.0000
##    620        0.0031            -nan     0.1000   -0.0000
##    640        0.0028            -nan     0.1000   -0.0000
##    660        0.0026            -nan     0.1000   -0.0000
##    680        0.0024            -nan     0.1000   -0.0000
##    700        0.0022            -nan     0.1000   -0.0000
##    720        0.0020            -nan     0.1000   -0.0000
##    740        0.0018            -nan     0.1000   -0.0000
##    750        0.0017            -nan     0.1000   -0.0000
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.03
# predicting

predProba<-predict(model, xte, type = "prob")
# Classify as "Faillure" when the first probability column (assumed to be
# the "Faillure" class -- TODO confirm) exceeds the tuned threshold.
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        6      8
##   Normal         21   1206
# Overall accuracy.
mean(pred==ytest)
## [1] 0.9766317
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6078
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.2222222
specificity
## [1] 0.9934102
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### time slice
#### Prediction of the first failure # approach with SMOTE-balanced data
# dfSmote2T: SMOTE-resampled time-slice training set (column 1 = outcome).
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# fitting the gbm model (verbose boosting log captured below)
model=gbm.model(xtr,ytr,ctrl)
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.1382            -nan     0.1000    0.0446
##      2        1.0639            -nan     0.1000    0.0342
##      3        1.0015            -nan     0.1000    0.0291
##      4        0.9474            -nan     0.1000    0.0247
##      5        0.9043            -nan     0.1000    0.0213
##      6        0.8660            -nan     0.1000    0.0180
##      7        0.8322            -nan     0.1000    0.0160
##      8        0.8027            -nan     0.1000    0.0134
##      9        0.7710            -nan     0.1000    0.0143
##     10        0.7432            -nan     0.1000    0.0111
##     20        0.5682            -nan     0.1000    0.0038
##     40        0.4018            -nan     0.1000    0.0008
##     60        0.3131            -nan     0.1000   -0.0003
##     80        0.2508            -nan     0.1000   -0.0003
##    100        0.2052            -nan     0.1000   -0.0002
##    120        0.1712            -nan     0.1000   -0.0001
##    140        0.1425            -nan     0.1000   -0.0002
##    160        0.1196            -nan     0.1000   -0.0001
##    180        0.1000            -nan     0.1000   -0.0002
##    200        0.0853            -nan     0.1000   -0.0002
##    220        0.0730            -nan     0.1000   -0.0001
##    240        0.0625            -nan     0.1000    0.0000
##    260        0.0540            -nan     0.1000    0.0000
##    280        0.0464            -nan     0.1000   -0.0002
##    300        0.0405            -nan     0.1000   -0.0001
##    320        0.0351            -nan     0.1000   -0.0000
##    340        0.0306            -nan     0.1000   -0.0000
##    360        0.0268            -nan     0.1000   -0.0000
##    380        0.0234            -nan     0.1000   -0.0001
##    400        0.0204            -nan     0.1000   -0.0000
##    420        0.0178            -nan     0.1000   -0.0000
##    440        0.0158            -nan     0.1000   -0.0000
##    460        0.0139            -nan     0.1000   -0.0000
##    480        0.0122            -nan     0.1000   -0.0000
##    500        0.0106            -nan     0.1000   -0.0000
##    520        0.0092            -nan     0.1000   -0.0000
##    540        0.0081            -nan     0.1000   -0.0000
##    560        0.0071            -nan     0.1000    0.0000
##    580        0.0063            -nan     0.1000   -0.0000
##    600        0.0055            -nan     0.1000   -0.0000
##    620        0.0049            -nan     0.1000   -0.0000
##    640        0.0043            -nan     0.1000   -0.0000
##    660        0.0038            -nan     0.1000   -0.0000
##    680        0.0033            -nan     0.1000   -0.0000
##    700        0.0029            -nan     0.1000   -0.0000
##    720        0.0026            -nan     0.1000   -0.0000
##    740        0.0023            -nan     0.1000   -0.0000
##    760        0.0020            -nan     0.1000   -0.0000
##    780        0.0018            -nan     0.1000   -0.0000
##    800        0.0016            -nan     0.1000   -0.0000
##    820        0.0014            -nan     0.1000   -0.0000
##    840        0.0012            -nan     0.1000   -0.0000
##    860        0.0011            -nan     0.1000   -0.0000
##    880        0.0009            -nan     0.1000   -0.0000
##    900        0.0008            -nan     0.1000   -0.0000
##    920        0.0007            -nan     0.1000   -0.0000
##    940        0.0007            -nan     0.1000   -0.0000
##    960        0.0006            -nan     0.1000   -0.0000
##    980        0.0005            -nan     0.1000   -0.0000
##   1000        0.0004            -nan     0.1000   -0.0000
##   1020        0.0004            -nan     0.1000    0.0000
##   1040        0.0003            -nan     0.1000   -0.0000
##   1060        0.0003            -nan     0.1000   -0.0000
##   1080        0.0003            -nan     0.1000    0.0000
##   1100        0.0002            -nan     0.1000   -0.0000
##   1120        0.0002            -nan     0.1000   -0.0000
##   1140        0.0002            -nan     0.1000    0.0000
##   1160        0.0002            -nan     0.1000   -0.0000
##   1180        0.0001            -nan     0.1000   -0.0000
##   1200        0.0001            -nan     0.1000   -0.0000
##   1220        0.0001            -nan     0.1000   -0.0000
##   1240        0.0001            -nan     0.1000   -0.0000
##   1260        0.0001            -nan     0.1000   -0.0000
##   1280        0.0001            -nan     0.1000   -0.0000
##   1300        0.0001            -nan     0.1000   -0.0000
##   1320        0.0001            -nan     0.1000   -0.0000
##   1340        0.0001            -nan     0.1000   -0.0000
##   1360        0.0000            -nan     0.1000   -0.0000
##   1380        0.0000            -nan     0.1000   -0.0000
##   1400        0.0000            -nan     0.1000    0.0000
##   1420        0.0000            -nan     0.1000   -0.0000
##   1440        0.0000            -nan     0.1000   -0.0000
##   1460        0.0000            -nan     0.1000    0.0000
##   1480        0.0000            -nan     0.1000   -0.0000
##   1500        0.0000            -nan     0.1000    0.0000
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.01
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.01) is computed above but a hard-coded 0.2 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure        8     57
##   Normal         19   1157
# Overall accuracy.
mean(pred==ytest)
## [1] 0.9387591
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6247
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.2962963
specificity
## [1] 0.9530478
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

Application of the Bayesian GLM model

### bayesian linear model

# 2-fold CV with class probabilities (needed for the threshold tuning below).
ctrl <- trainControl(method = "cv", number = 2, classProbs = TRUE)
# Fit a Bayesian generalized linear model through caret (method "bayesglm").
#
# xd   - predictor matrix / data frame.
# yd   - outcome factor ("Faillure"/"Normal").
# ctrl - a caret::trainControl object describing the resampling scheme.
#
# Registers a 5-core doMC backend so caret can resample in parallel, and
# returns the fitted caret `train` object.
bayes.glm <- function(xd, yd, ctrl) {
  registerDoMC(cores = 5)  # parallel backend for caret resampling
  train(
    x = xd,
    y = yd,
    method = "bayesglm",
    trControl = ctrl
  )
}


########## the first case machine learning approaches ####


#### Only the first failure prediction
# Random (non time-sliced) train/test split built earlier in the file.
ytr=ytrain2
ytest=ytest2
xtr=xtrain
xte=xtest
# fitting the Bayesian GLM
model=bayes.glm(xtr,ytr,ctrl)
## Warning: fitted probabilities numerically 0 or 1 occurred
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.02
# predicting


predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.02) is computed above but a hard-coded 0.05 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.05,"Faillure","Normal"))
# sensitivity (recall on "Faillure") and specificity (recall on "Normal").
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       24    234
##   Normal         28   2485
# Overall accuracy.
mean(pred==ytest)
## [1] 0.9054493
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6877
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.4615385
specificity
## [1] 0.9139389
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

#### SMOTE data: prediction of the first failure
# dfSmote2: SMOTE-resampled random-split training set (column 1 = outcome).
ytr=dfSmote2$ytrain2
ytest=ytest2
xtr=dfSmote2[,-1]
xte=xtest
# fitting the Bayesian GLM
model=bayes.glm(xtr,ytr,ctrl)
## Warning: fitted probabilities numerically 0 or 1 occurred
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.39
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.39) is computed above but a hard-coded 0.2 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       25    434
##   Normal         27   2285
# Overall accuracy.
mean(pred==ytest)
## [1] 0.8336341
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 52 controls (as.numeric(ytest) 1) < 2719 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6606
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.4807692
specificity
## [1] 0.8403825
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

############ second approach in case of time slice ####

#### Predicting all the failures
# the second case with the first failure ###
# Time-slice split: the *T train/test objects are built earlier in the file.
ytr=ytrain2T
ytest=ytest2T
xtr=xtrainT
xte=xtestT
# fitting the Bayesian GLM
model=bayes.glm(xtr,ytr,ctrl)
## Warning: fitted probabilities numerically 0 or 1 occurred
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.02
# predicting

predProba<-predict(model, xte, type = "prob")
# Classify as "Faillure" when the first probability column (assumed to be
# the "Faillure" class -- TODO confirm) exceeds the tuned threshold.
pred = as.factor(ifelse(predProba[1] > optimalthr,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       14    217
##   Normal         13    997
# Overall accuracy.
mean(pred==ytest)
## [1] 0.8146656
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.6699
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.5185185
specificity
## [1] 0.8212521
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

###### time slice
#### Prediction of the first failure # approach with SMOTE-balanced data
# dfSmote2T: SMOTE-resampled time-slice training set (column 1 = outcome).
ytr=dfSmote2T$ytrain2T
ytest=ytest2T
xtr=dfSmote2T[,-1]
xte=xtestT
# fitting the Bayesian GLM
model=bayes.glm(xtr,ytr,ctrl)
## Warning: fitted probabilities numerically 0 or 1 occurred
# seeing the performance, choosing the best threshold on the training fit
fittingProba<-predict(model, xtr ,type = "prob")
optimalthr=opthr(fittingProba = fittingProba, ytr)

optimalthr
## [1] 0.31
# predicting

predProba<-predict(model, xte, type = "prob")
# NOTE(review): optimalthr (0.31) is computed above but a hard-coded 0.2 is
# used here -- confirm which threshold is intended.
pred = as.factor(ifelse(predProba[1] > 0.2,"Faillure","Normal"))
sensibility = mean(pred[ytest == "Faillure"] == "Faillure")
specificity = mean(pred[ytest == "Normal"] == "Normal")
table(pred,ytest)
##           ytest
## pred       Faillure Normal
##   Faillure       17    228
##   Normal         10    986
# Overall accuracy.
mean(pred==ytest)
## [1] 0.8082192
# ROC from hard labels (degenerate two-point curve).
auc=roc(as.numeric(ytest), as.numeric(pred))
plot(auc, ylim=c(0,1), print.thres=TRUE, main=paste('AUC:',round(auc$auc[[1]],2)), bg="white")
## 
## Call:
## roc.default(response = as.numeric(ytest), predictor = as.numeric(pred))
## 
## Data: as.numeric(pred) in 27 controls (as.numeric(ytest) 1) < 1214 cases (as.numeric(ytest) 2).
## Area under the curve: 0.7209
abline(h=1,col='blue',lwd=2)
abline(h=0,col='red',lwd=2)

sensibility
## [1] 0.6296296
specificity
## [1] 0.8121911
## the variables importance
varimp <- varImp(model, scale = FALSE)
plot(varimp, top=20)

descnum(varimp = varimp)

descnum(varimp = varimp, max = 1:10)

Applying a single decision tree (rpart) for interpretability

# Fit one classification tree on the full design matrix and draw it with
# rattle's fancyRpartPlot so the split rules can be read directly.
# y2 and x are defined earlier in the file (not visible here) -- presumably
# the failure outcome and the predictor matrix; TODO confirm.
tree.2 <- rpart(y2 ~ ., as.data.frame(x), method = "class")
fancyRpartPlot(tree.2)